#Instalación rSFFreader
# source("http://bioconductor.org/biocLite.R")
# biocLite("rSFFreader")
require(rSFFreader)
library(gdata)
library(reshape2)
library(plotly)
library(ggplot2)

Ejemplo de libro

#ejemplo de libro
sff<- load454SampleData()
## Total number of reads to be read: 1000
## reading header for sff file:/home/sergio/R/x86_64-pc-linux-gnu-library/3.3/rSFFreader/extdata/Small454Test.sff
## reading file:/home/sergio/R/x86_64-pc-linux-gnu-library/3.3/rSFFreader/extdata/Small454Test.sff
##Generate some QA plots:
##Read length histograms:
par(mfrow=c(2,2))
clipMode(sff) <- "raw"
hist(width(sff),breaks=500,col="grey",xlab="Read Length",main="Raw Read Length")
## Base by position plots:
clipMode(sff) <- "raw"
ac <- alphabetByCycle(sread(sff),alphabet=c("A","C","T","G","N"))
ac.reads <- apply(ac,2,sum)
acf <- sweep(ac,MARGIN=2,FUN="/",STATS=apply(ac,2,sum))
matplot(cbind(t(acf),ac.reads/ac.reads[1]),col=c("green","blue","black","red","darkgrey","purple"),
        type="l",lty=1,xlab="Base Position",ylab="Base Frequency",
        main="Base by position")
cols <- c("green","blue","black","red","darkgrey","purple")
leg <- c("A","C","T","G","N","%reads")
legend("topright", col=cols, legend=leg, pch=18, cex=.8)
clipMode(sff) <- "full"
hist(width(sff),breaks=500,col="grey",xlab="Read Length",main="Clipped Read Length")
ac <- alphabetByCycle(sread(sff),alphabet=c("A","C","T","G","N"))
ac.reads <- apply(ac,2,sum)
acf <- sweep(ac,MARGIN=2,FUN="/",STATS=apply(ac,2,sum))
matplot(cbind(t(acf),ac.reads/ac.reads[1]),col=c("green","blue","black","red","darkgrey","purple"),
        type="l",lty=1,xlab="Base Position",ylab="Base Frequency",
        main="Base by position")
legend("topright", col=cols, legend=leg, pch=18, cex=.8)

IMPORT DATA

SIN CLIP - RAW DATA ANALYSIS

samples<-list.files("./samples")


lenStat_raw<-vector()
lenStat_qual<-vector()
nucFreq_raw<-matrix(nrow=1000, ncol=1)
nucFreq_qual<-matrix(nrow=1000, ncol=1)

count_raw<-data.frame()
count_qual<-data.frame()
len_raw<-list()
len_qual<-list()


for(i in samples){
  sff<-readSff(paste("samples/",i,sep=""), use.qualities=TRUE, use.names=TRUE,clipMode = c("raw"), verbose=TRUE)
##Generate some QA plots:
  ##Read length histograms (with and without clipping):
  
  # RAW
  par(mfrow=c(2,2))
  clipMode(sff) <- "raw"
  hist(width(sff),breaks=500,col="grey",xlab="Read Length",
       xlim= c(50,800), main= "RAW read length")
  lenStat_raw<-rbind(lenStat_raw, c(gsub("454Reads.MID_","",gsub(".sff","",i)), summary(width(sff)) ) )
  count_raw[i,1]<-length(sff)
  len_raw[[i]]<-width(sff)
    ## Base by position plots:
  ac <- alphabetByCycle(sread(sff),alphabet=c("A","C","T","G","N"))
  ac.reads <- apply(ac,2,sum)
  acf <- sweep(ac,MARGIN=2,FUN="/",STATS=apply(ac,2,sum))
  tacf<-t(acf); colnames(tacf)<-paste(gsub("454Reads.MID_","",gsub(".sff","",i)),"_",  c("A","C","T","G","N"),sep="")
  nucFreq_raw<-cbindX(nucFreq_raw,tacf)
  matplot(cbind(t(acf),ac.reads/ac.reads[1]),col=c("green","blue","black","red","darkgrey","purple"),
        type="l",lty=1,xlab="Base Position",ylab="Base Frequency",
        main="Base by position", xlim=c(0,1500))
  cols <- c("green","blue","black","red","darkgrey","purple")
  leg <- c("A","C","T","G","N","%reads")
  legend("topright", col=cols, legend=leg, pch=18, cex=.8)
  
  ### FILTER QUALITY
  clipMode(sff) <- "full"
  hist(width(sff),breaks=500,col="grey",xlab="Read Length",
       xlim= c(50,800),main="CLIPPED read length" )
  lenStat_qual<-rbind(lenStat_qual,c(gsub("454Reads.MID_","",gsub(".sff","",i)), summary(width(sff)) ) )
  count_qual[i,1]<-length(sff)
  len_qual[[i]]<-width(sff)
  ## Base by position plots:
  ac <- alphabetByCycle(sread(sff),alphabet=c("A","C","T","G","N"))
  ac.reads <- apply(ac,2,sum)
  acf <- sweep(ac,MARGIN=2,FUN="/",STATS=apply(ac,2,sum))
    tacf<-t(acf); colnames(tacf)<-paste(gsub("454Reads.MID_","",gsub(".sff","",i)),"_",  c("A","C","T","G","N"),sep="")
  nucFreq_qual<-cbindX(nucFreq_qual,tacf)
  matplot(cbind(t(acf),ac.reads/ac.reads[1]),col=c("green","blue","black","red","darkgrey","purple"),
        type="l",lty=1,xlab="Base Position",ylab="Base Frequency",
        main="Base by position", xlim=c(0,1000))
  par(mfrow=c(1,1))
  legend("topright", col=cols, legend=leg, pch=18, cex=.8)
  title(paste("sample: ",gsub("454Reads.MID_","",gsub(".sff","",i)),sep=""))
  dev.copy(png,filename=paste("Explo_",gsub("454Reads.MID_","",gsub(".sff","",i)),".png",sep=""));
  dev.off ();
}
## Total number of reads to be read: 2233
## reading header for sff file:samples/454Reads.MID_s01_rs3.sff
## reading file:samples/454Reads.MID_s01_rs3.sff

## Total number of reads to be read: 2405
## reading header for sff file:samples/454Reads.MID_s02_rs3.sff
## reading file:samples/454Reads.MID_s02_rs3.sff

## Total number of reads to be read: 2054
## reading header for sff file:samples/454Reads.MID_s03_rs5.sff
## reading file:samples/454Reads.MID_s03_rs5.sff

## Total number of reads to be read: 11821
## reading header for sff file:samples/454Reads.MID_s04_rl3.sff
## reading file:samples/454Reads.MID_s04_rl3.sff

## Total number of reads to be read: 10580
## reading header for sff file:samples/454Reads.MID_s05_rl3.sff
## reading file:samples/454Reads.MID_s05_rl3.sff

## Total number of reads to be read: 8869
## reading header for sff file:samples/454Reads.MID_s06_rl5.sff
## reading file:samples/454Reads.MID_s06_rl5.sff

## Total number of reads to be read: 9255
## reading header for sff file:samples/454Reads.MID_s07_dl3.sff
## reading file:samples/454Reads.MID_s07_dl3.sff

## Total number of reads to be read: 9579
## reading header for sff file:samples/454Reads.MID_s08_dl3.sff
## reading file:samples/454Reads.MID_s08_dl3.sff

## Total number of reads to be read: 9987
## reading header for sff file:samples/454Reads.MID_s09_dl5.sff
## reading file:samples/454Reads.MID_s09_dl5.sff

Estadísticas de las lecturas

#Statistics about lenght
as.data.frame(lenStat_raw)
##        V1 Min. 1st Qu. Median  Mean 3rd Qu. Max.
## 1 s01_rs3  198     577    603 623.1     648 1093
## 2 s02_rs3  193     572    600 622.5     652 1199
## 3 s03_rs5  200     603    644 654.4     691 1121
## 4 s04_rl3  533     686    694 695.8     702  929
## 5 s05_rl3  499     687    695 697.3     704 1133
## 6 s06_rl5  488     696    703   703     711 1779
## 7 s07_dl3  526     687    695 697.7     704 1477
## 8 s08_dl3  509     683    690 692.2     698 1211
## 9 s09_dl5  487     695    702 701.1     710  851
write.csv(as.data.frame(lenStat_raw), "Raw_Readlength_stats.csv",row.names = FALSE )
as.data.frame(lenStat_qual)
##        V1 Min. 1st Qu. Median  Mean 3rd Qu. Max.
## 1 s01_rs3   30     247    310 301.4     364  540
## 2 s02_rs3   30     244    300 294.3     348  524
## 3 s03_rs5   30     219    261 256.7     329  512
## 4 s04_rl3  379     613    626 598.7     630  668
## 5 s05_rl3  390     615    627 600.3     631 1114
## 6 s06_rl5  382     560    635 596.2     636  710
## 7 s07_dl3  382     618    628 602.3     631  664
## 8 s08_dl3  381     622    627 603.1     630  683
## 9 s09_dl5  382     568    635 601.1     636  654
write.csv(as.data.frame(lenStat_qual), "Clip_Readlength_stats.csv",row.names = FALSE )

#Nucleotide frequency by position
write.csv(nucFreq_raw, "Raw_NuclFreqByPos.csv",row.names = FALSE )
write.csv(nucFreq_qual, "Clip_NuclFreqByPos.csv",row.names = FALSE )

#All length data by sample
count_raw$sample<-gsub("454Reads.MID_","",gsub(".sff","",rownames(count_raw)))
count_qual$sample<-gsub("454Reads.MID_","",gsub(".sff","",rownames(count_qual)))
length_raw<-do.call(cbindX, lapply(len_raw, as.data.frame))
colnames(length_raw)<-gsub("454Reads.MID_","",gsub(".sff","",rownames(count_raw)))
length_qual<-do.call(cbindX, lapply(len_qual, as.data.frame))
colnames(length_qual)<-gsub("454Reads.MID_","",gsub(".sff","",rownames(count_qual)))
rownames(count_raw)<-NULL;rownames(count_qual)<-NULL


write.csv(length_raw, "Raw_AllReadlength.csv",row.names = FALSE )
write.csv(length_qual, "Clip_AllReadlength.csv",row.names = FALSE )

#AMount of sequences by sample
count_raw
##      V1  sample
## 1  2233 s01_rs3
## 2  2405 s02_rs3
## 3  2054 s03_rs5
## 4 11821 s04_rl3
## 5 10580 s05_rl3
## 6  8869 s06_rl5
## 7  9255 s07_dl3
## 8  9579 s08_dl3
## 9  9987 s09_dl5
write.csv(count_raw, "Raw_CountBySample.csv",row.names = FALSE )
count_qual
##      V1  sample
## 1  2233 s01_rs3
## 2  2405 s02_rs3
## 3  2054 s03_rs5
## 4 11821 s04_rl3
## 5 10580 s05_rl3
## 6  8869 s06_rl5
## 7  9255 s07_dl3
## 8  9579 s08_dl3
## 9  9987 s09_dl5
write.csv(count_qual, "Clip_CountBySample.csv",row.names = FALSE )
#GRAFICO DE BARRAS CON LA CANTIDAD DE LECTURAS
#plot(cantidad$cantidad, type="b")
library(ggplot2)
p<-ggplot(count_raw, aes(x=sample, weight=V1))+ geom_bar(fill="#2b8cbe")+
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggplotly(p)
#GRAFICO DE CAJAS
#boxplot(longitud)
p<-ggplot(data = melt(length_raw), aes(x=variable, y=value)) + geom_boxplot(aes(fill=variable))+
  theme(axis.text.x = element_text(angle = 45, hjust = 1))+ ggtitle("Raw Reads")
ggplotly(p)
#GRAFICO DE CAJAS
#boxplot(longitud)
p<-ggplot(data = melt(length_qual), aes(x=variable, y=value)) + geom_boxplot(aes(fill=variable))+
  theme(axis.text.x = element_text(angle = 45, hjust = 1))+ ggtitle("Clipped Reads")
ggplotly(p)